# Fix the RNG seed so the stratified sampling and MCMC runs below are reproducible.
set.seed(42)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
# Move into the project src directory so the relative data paths resolve.
# NOTE(review): setwd() inside a script is fragile -- prefer launching R from a
# fixed working directory; kept here for transcript fidelity.
setwd("~/Projects/companies-bankruptcy-forecast/src")
bankruptcy_data <- read.csv('../data/bankruptcy_Train.csv')
head(bankruptcy_data)
#summary(bankruptcy_data)
# Drop rows with missing values. The original call discarded its result
# (na.omit() does not modify in place), so the cleaned frame was never kept --
# assign it back.
bankruptcy_data <- na.omit(bankruptcy_data)
# removing those observation rows with 0 in any of the variables
# (vectorized over all 64 covariate columns instead of re-subsetting the data
# frame once per column in a loop)
bankruptcy_data <- bankruptcy_data[rowSums(bankruptcy_data[, 1:64] == 0) == 0, ]
dim(bankruptcy_data)
[1] 10000 65
# scale the covariates for easier comparison of coefficient posteriors.
# Columns 1:64 are the numeric predictors; column 65 (class) is left untouched.
# A single vectorized scale() call standardizes every column to mean 0, sd 1,
# replacing the original one-column-at-a-time loop.
bankruptcy_data[1:64] <- scale(bankruptcy_data[1:64])
dim(bankruptcy_data)
[1] 10000 65
# Encode the response as a factor so downstream modeling treats this as
# binary classification (levels "0" = solvent, "1" = bankrupt).
bankruptcy_data$class <- factor(bankruptcy_data$class)
# preparing the inputs
# NOTE(review): x and y are not referenced again in this transcript -- the
# stan_glm() fit below works straight from the data frame; confirm these are
# still needed.
x <- model.matrix(class ~ . - 1, data = bankruptcy_data)
y <- bankruptcy_data$class
dim(bankruptcy_data)
[1] 10000 65
head(bankruptcy_data)
# Stratified 80% subsample: sampling within each class preserves the imbalance.
bankruptcy_small <- bankruptcy_data %>% group_by(class) %>% sample_frac(.8)
typeof(bankruptcy_data)
[1] "list"
table(bankruptcy_small$class)
0 1
7838 162
# Training split: stratified 70% of the subsample.
bankruptcy_train <- bankruptcy_small %>% group_by(class) %>% sample_frac(.70)
bankruptcy_train
table(bankruptcy_train$class)
0 1
5487 113
# Test split: a fresh stratified 90% draw of bankruptcy_small with all
# training rows removed (anti_join matches on every shared column).
# NOTE(review): about 10% of the non-training rows are silently dropped by the
# 0.90 draw, and exact-duplicate observations would be excluded from the test
# set if their twin landed in training -- confirm this split is intended.
bankruptcy_test <- anti_join(bankruptcy_small %>% group_by(class) %>% sample_frac(.90), bankruptcy_train)
Joining, by = c("Attr1", "Attr2", "Attr3", "Attr4", "Attr5", "Attr6", "Attr7", "Attr8", "Attr9", "Attr10", "Attr11", "Attr12", "Attr13", "Attr14", "Attr15", "Attr16", "Attr17", "Attr18", "Attr19", "Attr20", "Attr21", "Attr22", "Attr23", "Attr24", "Attr25", "Attr26", "Attr27", "Attr28", "Attr29", "Attr30", "Attr31", "Attr32", "Attr33", "Attr34", "Attr35", "Attr36", "Attr37", "Attr38", "Attr39", "Attr40", "Attr41", "Attr42", "Attr43", "Attr44", "Attr45", "Attr46", "Attr47", "Attr48", "Attr49", "Attr50", "Attr51", "Attr52", "Attr53", "Attr54", "Attr55", "Attr56", "Attr57", "Attr58", "Attr59", "Attr60", "Attr61", "Attr62", "Attr63", "Attr64", "class")
# Inspect the test split and its class balance.
bankruptcy_test
table(bankruptcy_test$class)
0 1
2105 47
# Training-set dimensions: n observations by p columns (64 covariates + class).
# Use <- for assignment and nrow()/ncol() instead of indexing dim().
n <- nrow(bankruptcy_train)
p <- ncol(bankruptcy_train)
#str(bankruptcy_train)
# n=dim(bankruptcy_data)[1]
# p=dim(bankruptcy_data)[2]
dim(bankruptcy_train)
[1] 5600 65
# Column names: 64 financial-ratio covariates (Attr1..Attr64) plus the class label.
names(bankruptcy_train)
[1] "Attr1" "Attr2" "Attr3" "Attr4" "Attr5" "Attr6" "Attr7" "Attr8" "Attr9"
[10] "Attr10" "Attr11" "Attr12" "Attr13" "Attr14" "Attr15" "Attr16" "Attr17" "Attr18"
[19] "Attr19" "Attr20" "Attr21" "Attr22" "Attr23" "Attr24" "Attr25" "Attr26" "Attr27"
[28] "Attr28" "Attr29" "Attr30" "Attr31" "Attr32" "Attr33" "Attr34" "Attr35" "Attr36"
[37] "Attr37" "Attr38" "Attr39" "Attr40" "Attr41" "Attr42" "Attr43" "Attr44" "Attr45"
[46] "Attr46" "Attr47" "Attr48" "Attr49" "Attr50" "Attr51" "Attr52" "Attr53" "Attr54"
[55] "Attr55" "Attr56" "Attr57" "Attr58" "Attr59" "Attr60" "Attr61" "Attr62" "Attr63"
[64] "Attr64" "class"
# Data frames are lists at the storage level, hence typeof() returns "list".
typeof(bankruptcy_train)
[1] "list"
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ──────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.2 ✓ purrr 0.3.4
✓ tibble 3.0.3 ✓ stringr 1.4.0
✓ tidyr 1.1.1 ✓ forcats 0.5.0
✓ readr 1.3.1
── Conflicts ─────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(caret)
Loading required package: lattice
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘caret’
The following object is masked from ‘package:purrr’:
lift
library(GGally)
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
library(ggplot2)
library(corrplot)
corrplot 0.84 loaded
library(bayesplot)
This is bayesplot version 1.7.2
- Online documentation and vignettes at mc-stan.org/bayesplot
- bayesplot theme set to bayesplot::theme_default()
* Does _not_ affect other ggplot2 plots
* See ?bayesplot_theme_set for details on theme setting
# Apply bayesplot's default theme (with a sans-serif base font) to all ggplot output.
theme_set(bayesplot::theme_default(base_family = "sans"))
library(rstanarm)
Loading required package: Rcpp
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
This is rstanarm version 2.21.1
- See https://mc-stan.org/rstanarm/articles/priors for changes to default priors!
- Default priors may change, so it's safest to specify priors, even if equivalent to the defaults.
- For execution on a local, multicore CPU with excess RAM we recommend calling
options(mc.cores = parallel::detectCores())
Attaching package: ‘rstanarm’
The following objects are masked from ‘package:caret’:
compare_models, R2
# Run MCMC chains in parallel, one per available core (as rstanarm's startup
# message recommends for multicore machines).
options(mc.cores = parallel::detectCores())
library(loo)
This is loo version 2.3.1
- Online documentation and vignettes at mc-stan.org/loo
- As of v2.0.0 loo defaults to 1 core but we recommend using as many as possible. Use the 'cores' argument or set options(mc.cores = NUM_CORES) for an entire session.
library(projpred)
This is projpred version 1.1.6.
# Fixed seed constant for the Stan fits; use <- for assignment.
# NOTE(review): the stan_glm() call below passes the literal 42 rather than
# SEED -- consider seed = SEED for consistency.
SEED <- 42
library(broom)
dim(bankruptcy_train)
[1] 5600 65
# Weakly-informative Student-t prior (df = 7, location 0, scale 2.5) for the
# coefficients and intercept; reasonable since the covariates were
# standardized above.
t_prior <- student_t(df = 7, location = 0, scale = 2.5)
## Development
# Bayesian logistic regression on all 64 covariates, fit on the training
# split with 4 chains across 4 cores (sampler log below shows the default
# 2000 iterations per chain, half warmup).
post1 <- stan_glm(class ~ . , data = bankruptcy_train,
family = binomial(link = "logit"),
prior = t_prior, prior_intercept = t_prior,
cores=4, seed = 42)
starting worker pid=4838 on localhost:11290 at 22:31:40.862
starting worker pid=4852 on localhost:11290 at 22:31:41.455
starting worker pid=4866 on localhost:11290 at 22:31:41.987
starting worker pid=4880 on localhost:11290 at 22:31:42.508
SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).
Chain 1:
Chain 1: Gradient evaluation took 0.001206 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 12.06 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1:
Chain 1:
Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).
Chain 2:
Chain 2: Gradient evaluation took 0.000917 seconds
Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 9.17 seconds.
Chain 2: Adjust your expectations accordingly!
Chain 2:
Chain 2:
Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 3).
Chain 3:
Chain 3: Gradient evaluation took 0.000924 seconds
Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 9.24 seconds.
Chain 3: Adjust your expectations accordingly!
Chain 3:
Chain 3:
Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 4).
Chain 4:
Chain 4: Gradient evaluation took 0.001083 seconds
Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 10.83 seconds.
Chain 4: Adjust your expectations accordingly!
Chain 4:
Chain 4:
Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 3:
Chain 3: Elapsed Time: 1113.53 seconds (Warm-up)
Chain 3: 797.956 seconds (Sampling)
Chain 3: 1911.49 seconds (Total)
Chain 3:
Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 1:
Chain 1: Elapsed Time: 1151.39 seconds (Warm-up)
Chain 1: 883.082 seconds (Sampling)
Chain 1: 2034.47 seconds (Total)
Chain 1:
Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 4:
Chain 4: Elapsed Time: 1113.36 seconds (Warm-up)
Chain 4: 1026.71 seconds (Sampling)
Chain 4: 2140.08 seconds (Total)
Chain 4:
Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 2:
Chain 2: Elapsed Time: 1136.16 seconds (Warm-up)
Chain 2: 1217.05 seconds (Sampling)
Chain 2: 2353.21 seconds (Total)
Chain 2:
## ALL DATA
# post1 <- stan_glm(class ~ . , data = bankruptcy_data,
# family = binomial(link = "logit"),
# prior = t_prior, prior_intercept = t_prior,
# seed = 42, cores=4)
# Posterior summary: coefficient means and quantiles plus MCMC diagnostics
# (mcse, Rhat, n_eff) -- see the output below.
summary(post1)
Model Info:
function: stan_glm
family: binomial [logit]
formula: class ~ .
algorithm: sampling
sample: 4000 (posterior sample size)
priors: see help('prior_summary')
observations: 5600
predictors: 65
Estimates:
mean sd 10% 50% 90%
(Intercept) -7.1 0.4 -7.6 -7.1 -6.6
Attr1 -3.6 2.0 -6.3 -3.6 -1.1
Attr2 2.0 2.4 -1.1 1.8 5.0
Attr3 0.7 0.3 0.3 0.6 1.0
Attr4 1.2 1.5 -0.7 1.3 3.1
Attr5 0.3 0.3 0.0 0.3 0.7
Attr6 -0.1 0.3 -0.4 -0.1 0.3
Attr7 -0.8 3.1 -4.4 -0.7 2.9
Attr8 1.1 1.6 -0.9 1.1 3.2
Attr9 -0.8 0.5 -1.4 -0.8 -0.1
Attr10 -1.1 2.1 -3.8 -1.1 1.5
Attr11 0.4 0.6 -0.3 0.4 1.1
Attr12 1.0 0.9 -0.2 0.9 2.2
Attr13 -2.1 2.1 -4.9 -1.7 0.0
Attr14 -0.8 3.0 -4.4 -0.7 2.7
Attr15 0.1 0.2 -0.2 0.1 0.4
Attr16 0.3 1.4 -1.5 0.3 2.0
Attr17 -1.0 1.7 -3.2 -1.0 1.1
Attr18 -1.0 3.1 -4.7 -0.9 2.5
Attr19 -0.8 2.4 -3.9 -0.8 2.1
Attr20 -0.1 1.4 -2.0 -0.1 1.6
Attr21 -2.2 1.9 -4.6 -1.8 -0.3
Attr22 -2.1 0.9 -3.3 -2.1 -0.9
Attr23 -1.0 2.5 -4.2 -0.9 1.9
Attr24 0.2 0.2 0.0 0.2 0.4
Attr25 -0.3 0.4 -0.8 -0.4 0.2
Attr26 -1.1 1.3 -2.8 -1.1 0.6
Attr27 -2.1 1.9 -4.6 -1.7 -0.2
Attr28 -0.1 2.3 -2.9 -0.1 2.7
Attr29 0.2 0.1 0.0 0.2 0.3
Attr30 -0.8 1.5 -2.8 -0.7 1.0
Attr31 3.1 2.4 0.2 3.0 6.3
Attr32 -1.7 1.5 -3.8 -1.4 -0.1
Attr33 4.4 2.6 1.2 4.2 7.8
Attr34 3.2 0.4 2.6 3.2 3.7
Attr35 -0.5 0.3 -0.9 -0.5 -0.2
Attr36 0.4 0.5 -0.3 0.4 1.0
Attr37 -1.1 1.1 -2.6 -0.8 0.0
Attr38 -1.2 2.7 -4.6 -1.1 2.0
Attr39 -4.8 1.4 -6.6 -4.8 -3.0
Attr40 2.7 1.1 1.4 2.7 4.0
Attr41 0.0 0.1 -0.1 0.0 0.1
Attr42 -1.5 2.3 -4.5 -1.4 1.2
Attr43 -1.0 2.1 -3.7 -0.9 1.7
Attr44 -1.6 1.4 -3.3 -1.5 0.1
Attr45 -0.8 2.4 -3.9 -0.6 2.2
Attr46 -4.7 1.7 -7.0 -4.7 -2.6
Attr47 -1.1 2.4 -4.2 -0.9 1.6
Attr48 3.7 1.0 2.5 3.7 5.0
Attr49 0.3 1.9 -2.1 0.3 2.8
Attr50 -2.4 0.7 -3.3 -2.4 -1.5
Attr51 -0.1 0.3 -0.6 -0.2 0.3
Attr52 -1.6 2.2 -4.3 -1.5 1.1
Attr53 0.4 0.4 -0.1 0.3 1.0
Attr54 -1.2 2.4 -4.3 -1.1 1.6
Attr55 0.1 0.1 -0.1 0.1 0.2
Attr56 5.4 1.2 3.9 5.4 7.0
Attr57 0.0 0.1 -0.2 0.0 0.2
Attr58 -1.3 1.4 -3.0 -1.3 0.5
Attr59 -0.3 0.2 -0.6 -0.2 0.0
Attr60 -2.5 1.8 -4.9 -2.1 -0.5
Attr61 -0.4 0.3 -0.9 -0.4 -0.1
Attr62 2.0 1.9 -0.4 1.9 4.4
Attr63 -10.0 2.9 -13.8 -9.8 -6.5
Attr64 -0.1 1.0 -1.4 -0.1 1.1
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 0.0 0.0 0.0 0.0 0.0
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 2873
Attr1 0.0 1.0 3149
Attr2 0.0 1.0 3694
Attr3 0.0 1.0 2710
Attr4 0.0 1.0 3153
Attr5 0.0 1.0 2255
Attr6 0.0 1.0 2923
Attr7 0.0 1.0 5283
Attr8 0.0 1.0 2817
Attr9 0.0 1.0 2650
Attr10 0.0 1.0 3508
Attr11 0.0 1.0 3432
Attr12 0.0 1.0 3656
Attr13 0.0 1.0 2318
Attr14 0.0 1.0 5555
Attr15 0.0 1.0 6639
Attr16 0.0 1.0 2955
Attr17 0.0 1.0 2847
Attr18 0.0 1.0 4466
Attr19 0.0 1.0 4288
Attr20 0.0 1.0 2384
Attr21 0.0 1.0 2064
Attr22 0.0 1.0 2628
Attr23 0.0 1.0 3940
Attr24 0.0 1.0 2818
Attr25 0.0 1.0 4044
Attr26 0.0 1.0 2803
Attr27 0.0 1.0 2187
Attr28 0.0 1.0 3491
Attr29 0.0 1.0 4999
Attr30 0.0 1.0 3795
Attr31 0.0 1.0 3960
Attr32 0.0 1.0 2625
Attr33 0.0 1.0 2887
Attr34 0.0 1.0 2684
Attr35 0.0 1.0 3718
Attr36 0.0 1.0 2452
Attr37 0.0 1.0 1845
Attr38 0.0 1.0 4503
Attr39 0.0 1.0 2736
Attr40 0.0 1.0 2340
Attr41 0.0 1.0 3684
Attr42 0.0 1.0 2888
Attr43 0.0 1.0 2643
Attr44 0.0 1.0 2359
Attr45 0.0 1.0 4973
Attr46 0.0 1.0 2413
Attr47 0.0 1.0 3215
Attr48 0.0 1.0 2695
Attr49 0.0 1.0 3819
Attr50 0.0 1.0 2460
Attr51 0.0 1.0 2601
Attr52 0.0 1.0 3577
Attr53 0.0 1.0 2243
Attr54 0.0 1.0 3740
Attr55 0.0 1.0 4196
Attr56 0.0 1.0 2820
Attr57 0.0 1.0 4416
Attr58 0.0 1.0 3704
Attr59 0.0 1.0 2276
Attr60 0.0 1.0 2662
Attr61 0.0 1.0 3455
Attr62 0.0 1.0 3726
Attr63 0.1 1.0 2527
Attr64 0.0 1.0 2728
mean_PPD 0.0 1.0 3909
log-posterior 0.2 1.0 1316
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
# Posterior predictive checks: density overlay and test-statistic comparison.
pp_check(post1, "dens_overlay")
pp_check(post1, "stat")
### ALL TEST DATA
# NOTE(review): this OVERWRITES the held-out bankruptcy_test split created
# above with an external test file -- confirm which test set the evaluation
# below is meant to use.
bankruptcy_test <- read_csv('../data/bankruptcy_Test_X.csv')
Parsed with column specification:
cols(
.default = col_double()
)
See spec(...) for full column specifications.
dim(bankruptcy_test)
[1] 2152 65
table(bankruptcy_test$class)
0 1
2105 47
# NOTE(review): the dim output below ([1] 5000 64) is inconsistent with the
# surrounding 2152-row dims -- this transcript output looks stale; verify
# whether the -ID selection belongs to an earlier session.
bankruptcy_test_x <- select(bankruptcy_test, -ID) # Remove ID Column
dim(bankruptcy_test_x)
[1] 5000 64
bankruptcy_test <- ungroup(bankruptcy_test)
dim(bankruptcy_test)
[1] 2152 65
# Predictor-only frame for posterior_predict(): drop the response column.
bankruptcy_test_x <- select(bankruptcy_test, -class)
dim(bankruptcy_test_x)
[1] 2152 64
# Posterior predictive draws: one simulated 0/1 outcome per posterior draw
# (rows) and per test observation (columns).
posterior <- posterior_predict(post1, newdata = bankruptcy_test_x)
dim(posterior)
[1] 4000 2152
# Distribution of the simulated outcomes over all draws and observations.
hist(posterior)
#bankruptcy_test$class
# Predicted bankruptcy probability per observation = mean of its 0/1 posterior
# predictive draws; classify with a 0.5 threshold.
pred <- colMeans(posterior)
pr <- as.integer(pred >= 0.5)
table(pr)
pr
0 1
2150 2
# Observed labels for the test set.
true_pr <- bankruptcy_test$class
table(true_pr)
true_pr
0 1
2105 47
# Confusion table: rows = truth, columns = prediction.
table(true_pr, pr)
pr
true_pr 0 1
0 2104 1
1 46 1
library(MLmetrics)
Attaching package: ‘MLmetrics’
The following objects are masked from ‘package:caret’:
MAE, RMSE
The following object is masked from ‘package:base’:
Recall
ConfusionMatrix(pr, true_pr)
y_pred
y_true 0 1
0 2104 1
1 46 1
# NOTE(review): MLmetrics::Precision/Recall default the positive class to the
# first factor level, which is "0" (non-bankrupt) here -- the near-1 values
# below describe the majority class (0.9786 = 2104/2150, 0.9995 = 2104/2105
# from the table above). Pass positive = "1" to score the rare bankruptcy
# class instead.
Precision(true_pr, pr)
[1] 0.9786047
Recall(true_pr, pr)
[1] 0.9995249
#Store Train and Test Dataframes for future model comparison
dim(bankruptcy_train)
[1] 5600 65
# row.names = FALSE keeps the row index out of the CSV; otherwise it comes
# back as a spurious unnamed first column ("X") when the file is re-read.
write.csv(bankruptcy_train, "../data/bankruptcy_train_am.csv", row.names = FALSE)
dim(bankruptcy_test)
[1] 2152 65
write.csv(bankruptcy_test, "../data/bankruptcy_test_am.csv", row.names = FALSE)